/*=======================================================================
Creator: Jingyuan Wang, jingyuanwang@u.northwestern.edu
Date created: 				07/19/2018
Date last modified: 		
Purpose: Assign naics codes to 12 sectors in Misato data

==========================================================================*/


*************************************************************************
* 				PART A. Get NAICS-ISIC code		 						*
*************************************************************************
import excel using "$crosswalks/2007_NAICS_to_ISIC_4.xls", sheet("NAICS 07 to ISIC 4 technical") first clear
drop F NoteslinkcontentbasedonNA
rename NAICSUS naics_2007
rename NAICSUSTITLE naics_2007_title2
rename ISIC40 isic_code
rename ISICRevision40Title isic_title
duplicates drop


*************************************************************************
* 				PART B. Merge NAICS-ISIC with our NAICS code		 	*
*************************************************************************
preserve
	use "$crosswalks/crosswalk_by_naics_year.dta", clear
	keep census_naics naics_2007 naics_2007_title
	duplicates drop
	tempfile code
	save `code', replace
restore

merge m:1 naics_2007 using `code.dta'
drop _merge

count if naics_2007_title2 != naics_2007_title & naics_2007_title != ""
* 0
drop naics_2007_title
rename naics_2007_title2 naics_2007_title

*************************************************************************
* 				PART C. Assign to 12 sectors by ISIC code				*
*************************************************************************
*drop if isic_code == "012X" // drop the none-digit one, also this one is not related to our research
replace isic_code = "0" + isic_code if strlen(isic_code) == 3

gen isic_2digit = substr(isic_code,1,2)

gen sector = ""
replace sector = "Iron and steel" 				if  isic_code == "2410" | isic_code == "2431"
replace sector = "Chemical and petrochemical" 	if  isic_2digit == "20" | isic_2digit == "21"
replace sector = "Non-ferrous metals" 			if  isic_code == "2420" | isic_code == "2432"
replace sector = "Non-metallic minerals" 		if  isic_2digit == "23"
replace sector = "Transport equipment" 			if  isic_2digit == "29" | isic_2digit == "30"
replace sector = "Machinery" 					if  isic_2digit == "25" | isic_2digit == "26" | isic_2digit == "27" |  isic_2digit == "28"
replace sector = "Mining and quarrying" 		if  isic_code == "0990" | isic_2digit == "07" | isic_2digit == "08"
replace sector = "Food and tobacco" 			if  isic_2digit == "10" | isic_2digit == "11" | isic_2digit == "12"
replace sector = "Paper, pulp and print" 		if  isic_2digit == "17" | isic_2digit == "18"
replace sector = "Wood and wood products" 		if  isic_2digit == "16" 
replace sector = "Construction" 				if  isic_2digit == "41" | isic_2digit == "42" | isic_2digit == "43"
replace sector = "Textile and leather" 			if  isic_2digit == "13" | isic_2digit == "14" | isic_2digit == "15"
replace sector = "Non-specified (Industry)" 	if  isic_2digit == "22" | isic_2digit == "31" | isic_2digit == "32"



*************************************************************************
* 				PART D. Export											*
*************************************************************************

* 1. clean and export a 400 rows all NAICS 31-33 industries codebook

keep if naics_2007 >= 310000 & naics_2007 <= 339999

* find out why there are 4 of the 96 industries with missing sector
tab naics_2007 if sector == ""

* These census_naics industry is assigned to 2 kind of ISIC code, one of which has a IEA sector name
drop if naics_2007 == 311711 & sector == ""
drop if naics_2007 == 311712 & sector == ""
drop if naics_2007 == 325110 & sector == ""
drop if naics_2007 == 325192 & sector == ""
drop if naics_2007 == 323114 & sector == ""
drop if naics_2007 == 324199 & sector == ""
drop if naics_2007 == 325110 & sector == ""
drop if naics_2007 == 325192 & sector == ""
drop if naics_2007 == 325998 & sector == ""
drop if naics_2007 == 336611 & sector == ""

* This census_naics industry is within 2-digit ISIC sector 19, and this is not included in IEA stats. So assign ISIC sector 20 to it.
replace sector = "Chemical and petrochemical" if naics_2007 == 324110 
replace sector = "Chemical and petrochemical" if naics_2007 == 324191 

* 312113 naics industry is within 2-digit ISIC sector 35, and this is not included in IEA stats.


* 2. drop duplications
keep if naics_2007 != .
keep  naics_2007 naics_2007_title sector
duplicates drop 
duplicates tag naics_2007, gen(dup)

* believe that there won't be heterogeneity within naics5 level 
preserve
	keep if dup == 0
	gen naics_5digit = floor(naics_2007/10)
	keep naics_5digit sector
	drop if sector == ""
	duplicates drop 
	duplicates tag naics_5digit, gen(dup)
	drop if naics_5digit == 32222 & sector != "Paper, pulp and print"
	drop if naics_5digit == 32419 & sector != "Chemical and petrochemical"
	drop if naics_5digit == 33461 & sector != "Paper, pulp and print"
	drop dup
	rename sector sector_5digit
	tempfile sector
	save `sector.dta'
restore

gen naics_5digit = floor(naics_2007/10)
merge m:1 naics_5digit using `sector.dta'

preserve
	keep if _merge == 1
	drop if naics_5digit == 31214 & sector != "Food and tobacco"
	drop if naics_5digit == 31332 & sector != "Textile and leather"
	drop if naics_5digit == 33242 & sector != "Machinery"
	drop if naics_5digit == 33422 & sector != "Machinery"
	drop if naics_5digit == 33429 & sector != "Machinery"
	drop if naics_5digit == 33633 & sector != "Transport equipment"
	drop if naics_5digit == 33634 & sector != "Transport equipment"
	drop if naics_5digit == 33635 & sector != "Transport equipment"
	drop if naics_5digit == 33636 & sector != "Transport equipment"
	drop if naics_5digit == 33792 & sector != "Non-specified (Industry)"
	drop if naics_5digit == 33992 & sector != "Non-specified (Industry)"
	drop if naics_5digit == 33995 & sector != "Non-specified (Industry)"
	keep naics_5digit sector
	rename sector sector_5digit
	duplicates drop
	tempfile sector2
	save `sector2.dta'
restore

use `sector.dta', clear
append using `sector2.dta'

* 3. output
sort naics_5digit
rename sector_5digit sector
order naics_5digit sector
save "$crosswalks/crosswalk_censusnaics_iea_allindustries.dta", replace


